library(dplyr)
library(data.table)
library(stringr)

#' Collapse a three-part branching question into a single 7-point score.
#'
#' Rules are applied in order, so a later rule overwrites an earlier one when
#' several match the same element:
#'   `first`  "Very close" -> 1, "Somewhat close" -> 2
#'   `second` pro_tech_text -> 3, in_between_text -> 4, anti_tech_text -> 5
#'   `third`  "Somewhat close" -> 6, "Very close" -> 7
#'
#' @param first,second,third Character vectors of raw answers; they are
#'   expected to have the same length (one element per respondent).
#' @param pro_tech_text Exact answer text in `second` that is scored 3.
#' @param anti_tech_text Exact answer text in `second` that is scored 5.
#' @param in_between_text Exact answer text in `second` that is scored 4.
#' @return A numeric vector with one score per element of `first`; `NA` where
#'   no rule matched (including missing answers).
recode_to_7 <- function(first, second, third, pro_tech_text, anti_tech_text, in_between_text){
  # Size the result from the input itself. The previous `length(df)` referred
  # to a name that is not an argument (a global object or `stats::df`), so the
  # result length was only right by accident.
  out <- rep(NA_real_, length(first))

  # which() drops NA comparisons, so missing answers stay NA in the result.
  out[which(first == 'Very close')] <- 1
  out[which(first == 'Somewhat close')] <- 2

  out[which(second == pro_tech_text)] <- 3
  out[which(second == in_between_text)] <- 4
  out[which(second == anti_tech_text)] <- 5

  out[which(third == "Somewhat close")] <- 6
  out[which(third == "Very close")] <- 7
  out
}


#' Import and clean wave 1 of the text survey.
#'
#' Reads `Data/text-wave-1.csv` with every column as character, keeps
#' respondents who agreed to participate, answered the screening item `Q2.1`
#' with 15 and have a non-empty `treatment`, and recodes the raw answers into
#' numeric outcomes, attention-check flags and covariates.
#'
#' @return A data.frame with one row per unique `PID` (first occurrence kept)
#'   holding the recoded variables, `att` (number of attention checks passed)
#'   and `pc`: the first principal component of the outcome measures, scaled
#'   to unit standard deviation and signed to correlate positively with the
#'   additive outcome index. `pc` is `NA` for respondents with incomplete
#'   outcomes.
import_text_wave_1 <- function(){
  # Columns of `data` whose names match the regular expression `pattern`.
  # NOTE(review): the "." in the patterns used below is a regex wildcard, and
  # e.g. 'Q19.1' would also match 'Q19.10'; confirm against the raw export.
  pick_cols <- function(data, pattern) {
    data[, grepl(pattern, colnames(data)), with = FALSE]
  }

  df <- fread('Data/text-wave-1.csv', colClasses = 'character')

  # Keep consenting respondents only.
  opt_in <- df$Q1.2 == 'I agree to participate'
  df <- df[opt_in, ]

  # Keep respondents who answered the screening item with 15 and who were
  # assigned a treatment.
  pass_easy <- df$Q2.1 == 15
  df <- df[pass_easy, ]
  df <- df[df$treatment != '', ]

  # Attention checks 1 and 2: the two required options are selected and
  # exactly two cells of the question are non-empty.
  pass_att1 <- (df$Q3.6_1 == 'Every day' & df$Q3.6_5 == 'Never') &
    rowSums(pick_cols(df, 'Q3.6') != '') == 2
  pass_att2 <- (df$Q3.10_5 == 'Classified' & df$Q3.10_12 == 'None of the above') &
    rowSums(pick_cols(df, 'Q3.10') != '') == 2
  # NOTE(review): the literal has no space between "agree" and "nor";
  # presumably it mirrors the raw export -- confirm before changing.
  pass_att3 <- df$grid_screen_3 == 'Neither agreenor disagree'

  # Number of non-empty answers in the Q15.9 block: first five columns vs.
  # columns six to nine.
  break_up <- pick_cols(df, 'Q15.9')
  break_tech <- rowSums(break_up[, 1:5] != '')
  break_placebo <- rowSums(break_up[, 6:9] != '')

  # Favorability items recoded to 1-7, then averaged over the first five
  # columns and over the remaining columns.
  fav <- pick_cols(df, 'Q19.1')
  fav <- apply(fav, 2, function(x){ recode(x, 'Very favorable'=7,
                                           'Favorable'=6,
                                           'Somewhat favorable'=5,
                                           'Neither favorable nor unfavorable'=4,
                                           'Somewhat unfavorable'=3,
                                           'Unfavorable'=2,
                                           'Very unfavorable'=1,
                                           .default = NA_real_)})
  fav_tech <- rowMeans(fav[, 1:5])
  fav_plac <- rowMeans(fav[, 6:ncol(fav)])

  # Branching attitude questions collapsed to 7-point scores.
  scale <- recode_to_7(first=df$Q15.3, second=df$Q15.7, third=df$Q15.5,
                       pro_tech_text='The scale and efficiency of large tech companies like Apple, Amazon, Facebook, and Google benefits consumers more than it hurts them',
                       anti_tech_text='Large tech companies like Apple, Amazon, Facebook, and Google use their size to gain an unfair advantage over competitors and disadvantage consumers',
                       in_between_text='Both come equally close to my view')

  censorship <- recode_to_7(first=df$Q16.5, second=df$Q16.7, third=df$Q16.3,
                            pro_tech_text='Social networks like Facebook and Twitter should allow their users to freely express their views, even if it means allowing false, offensive, or harmful  content to circulate',
                            anti_tech_text='Social networks like Facebook and Twitter should do more to remove false, offensive, or harmful content from their platforms',
                            in_between_text='Truly unsure')

  privacy <- recode_to_7(first=df$Q17.3, second=df$Q17.7, third=df$Q17.5,
                         pro_tech_text='Big tech companies do a good job of keeping their users’ information secure',
                         anti_tech_text="Big tech companies do not do a good job of keeping their users’ information secure",
                         in_between_text='Truly unsure')

  congress <- recode_to_7(first=df$Q17.13, second=df$Q17.15, third=df$Q17.11,
                          pro_tech_text="Congress should allow large tech companies to store and use data on their users as they see fit",
                          anti_tech_text="Congress should more actively regulate how large tech companies gather and store data on their users",
                          in_between_text='Truly unsure')

  influence <- recode_to_7(first=df$Q18.4, second=df$Q18.5, third=df$Q18.3,
                           pro_tech_text="The influence of big tech companies on political life in America is often exaggerated",
                           anti_tech_text="Big tech companies exert too much influence over the political life in America" ,
                           in_between_text='Truly unsure')

  # Post-treatment items about the treatment itself.
  treat_supports <- recode(df$Q13.2,
                           'Definitely supports'=5,
                           'Somewhat supports'=4,
                           'Neither opposes nor supports'=3,
                           'Somewhat opposes'=2,
                           'Definitely opposes'=1,
                           .default = NA_real_)

  treat_effective <- recode(df$Q13.4,
                            'Definitely effective'=5,
                            'Effective'=4,
                            'Not sure'=3,
                            'Not effective'=2,
                            'Definitely not effective'=1,
                            .default = NA_real_)

  understand_tech <- recode(df$Q13.6,
                            'Very well'=4,
                            'Fairly well'=3,
                            'Not very well'=2,
                            'Not at all'=1,
                            .default = NA_real_)

  treat_liberal <- recode(df$Q13.8,
                          'Very liberal'=7,
                          'Somewhat liberal'=6,
                          'Slightly liberal'=5,
                          'Neither liberal nor conservative'=4,
                          'Slightly conservative'=3,
                          'Somewhat conservative'=2,
                          'Very conservative'=1,
                          .default = NA_real_)

  # Seven-point party identification: 1 = strong Republican ... 7 = strong
  # Democrat, assembled from the three branching items.
  pid7 <- rep(NA, nrow(df))
  pid7[df$Q20.4 == 'Strong Democrat'] <- 7
  pid7[df$Q20.4 == 'Not very strong Democrat'] <- 6

  pid7[df$Q20.5 == 'Strong Republican'] <- 1
  pid7[df$Q20.5 == 'Not very strong Republican'] <- 2

  pid7[df$Q20.6 == 'Closer to the Republican Party'] <- 3
  pid7[df$Q20.6 == 'Closer to the Democratic Party'] <- 5
  pid7[df$Q20.6 == 'Neither'] <- 4

  # Three-category version with leaners counted as partisans.
  pid3_leaner <- rep(NA, nrow(df))
  pid3_leaner[pid7 %in% c(7, 6, 5)] <- "Democrat"
  pid3_leaner[pid7 == 4] <- "Independent"
  pid3_leaner[pid7 %in% c(1, 2, 3)] <- "Republican"

  # Raw answer to Q20.2, kept as text.
  pid3 <- df$Q20.2

  ideo <- recode(df$Q20.8,
                 'Very liberal'=1,
                 'Liberal'=2,
                 'Moderate'=3,
                 'Conservative'=4,
                 'Very conservative'=5,
                 .default = NA_real_)

  income <- recode(df$Q20.10,
                   "Less than $10,000"=1,
                   "$10,000 - $19,999"=2,
                   "$20,000 - $29,999" =3,
                   "$30,000 - $39,999"=4,
                   "$40,000 - $49,999"=5,
                   "$50,000 - $59,999"=6,
                   "$60,000 - $69,999" =7,
                   "$70,000 - $79,999"=8,
                   "$80,000 - $89,999" =9,
                   "$90,000 - $99,999"=10,
                   "$100,000 - $119,999"=11,
                   "$120,000 - $149,999"=12,
                   "$150,000 - $199,999"=13,
                   "$200,000 - $249,999"=14,
                   "$250,000 - $349,000" =15,
                   "$350,000 - $499,000"=16,
                   "$500,000 or more" = 17,
                   .default = NA_real_)

  # Q3.2 is subtracted from 2021 to obtain age (presumably a birth year).
  age <- 2021 - as.numeric(df$Q3.2)
  edu <- recode(df$Q3.4,
                'Did not graduate from high school'=1,
                'High school'=2,
                'Some college, no degree'=3,
                'Two-year degree'=4,
                'Four-year degree'=5,
                'Graduate degree'=6,
                .default = NA_real_)

  man <- df$Q3.12 == 'Man'
  gender <- recode(df$Q3.12, 'Man'='Male', "Woman"="Female", 'Other (please specify)'='Other', .default = NA_character_)

  # One logical column per Q3.14 cell: TRUE when the cell is non-empty.
  race <- as.data.frame(pick_cols(df, 'Q3.14') != '')
  colnames(race) <- c('native-american', 'asian', 'black', 'hispanic', 'white', 'middle-eastern', 'other')

  med_pref <- str_extract(df$Q3.8, 'MSNBC|Fox News|Food Network')

  # NOTE(review): `familiar` is computed but never added to the returned data;
  # confirm whether it was meant to be included.
  familiar <- apply(pick_cols(df, 'Q13.10'), 2, recode, 'Extremely familiar'=5, 'Very familiar'=4,
                    'Moderately familiar'=3, 'Slightly familiar'=2, 'Not familiar at all'=1, .default = NA_real_) %>%
    rowMeans()

  # Outcome measures, an additive index of them (each term divided by its
  # maximum, some reversed), and the first principal component computed on
  # respondents with complete outcomes.
  outcomes <- data.frame(scale, censorship, privacy, congress, influence, fav_tech,break_tech)
  outcomes_index <- rowMeans(data.frame(1-(scale/7), censorship/7, 1-(privacy/7), 1-(congress/7), 1-(influence/7), fav_tech/7,1-(break_tech/5)))

  # Use one mask for rows, index and PCA input so they cannot drift apart.
  keep_complete <- complete.cases(outcomes)
  complete_df <- df[keep_complete, ]
  complete_df$outcomes_index <- outcomes_index[keep_complete]
  complete_df$pc <- -prcomp(outcomes[keep_complete, ], retx = TRUE)$x[, 1]
  complete_df$pc <- complete_df$pc/sd(complete_df$pc)
  # The sign of a principal component is arbitrary; orient it to agree with
  # the additive index. isTRUE() avoids an error when the correlation is NA.
  if (isTRUE(cor(complete_df$pc, complete_df$outcomes_index) < 0)) {
    complete_df$pc <- -complete_df$pc
  }

  # NOTE(review): `fav_tech` and `race` are each passed twice, so data.frame()
  # emits de-duplicated copies of those columns. Kept as-is so the returned
  # columns do not change; confirm whether the duplicates are wanted.
  to_return <- data.frame(EndDate=df$EndDate, break_tech, break_placebo, fav_tech, fav_plac, treat_supports,
                          scale, censorship, privacy, congress, influence, fav_tech,
                          treat_effective, understand_tech, treat_liberal, pass_att1, pass_att2, pass_att3,age,
                          ideo, pid3, pid7, pid3_leaner, income, race, med_pref, man, gender, race, edu, PID=df$PID, treatment=df$treatment)
  to_return$att <- pass_att1 + pass_att2 + pass_att3

  # One row per respondent, then attach the principal-component score by PID.
  to_return <- to_return[!duplicated(to_return$PID), ]
  to_return <- left_join(to_return,
                         complete_df[!duplicated(complete_df$PID), c('pc', 'PID')],
                         by = "PID")
  to_return
}
